#############################################
## PROGRAM NAME: 206_misc_checks.R         ##
## AUTHOR: MATT MLECZKO                    ##
## INPUTS:                                 ##
##         201_af_muni_fin.Rda             ##
##         100_af_muni.Rda                 ##
##         pl_to_county.csv                ##
##         cs_to_county.csv                ##
##                                         ##
## OUTPUTS:                                ##
##                                         ## 
## PURPOSE: Carry out miscellaneous checks ##
## LIST OF UPDATES:                        ##
#############################################

#log <- file(# USER DEFINED PATH AND FILE NAME HERE #)
#sink(log, append=TRUE)
#sink(log, append=TRUE, type="message")

set.seed(08542)

## load libraries ##

library(tidyverse)
library(foreign)
library(stringr)
library(tm)
library(gdata)
library(gsubfn)
library(haven)
library(readxl)
library(mice)
library(Rcpp)
library(basecamb)

## define paths ##
data_path <- # USER DEFINED PATH HERE #

## set working directory ##
setwd(data_path)

`%notin%` <- Negate(`%in%`)

## create a merge function that creates merge frequency as in Stata ##
## from user rwbuie at stackoverflow: https://stackoverflow.com/questions/30358401/is-there-a-way-to-create-statas-merge-indicator-variable-with-rs-merge ##
stata.merge <- function(x,y, name){
  x$df1 <- 1
  y$df2 <- 2
  df <- merge(x,y, by = name, all = TRUE)
  df$merge.variable <- rowSums(df[,c("df1", "df2")], na.rm=TRUE)
  df$df1 <- NULL
  df$df2<- NULL
  df
  #print(table(df$merge.variable))
  
  ## return the merged dataframe
  return(df)
}

## load analytic files ##

load("201_af_muni_fin.Rda")

###############################################
## check 1: geographic spread of ZRI changes ## 
###############################################

## zri increases ##
inc <- af.muni.fin %>%
  mutate(diff = zri_2022_cb - zri_2006_cb) %>%
  filter(diff > 0.5)

## zri decreases ##
dec <- af.muni.fin %>%
  mutate(diff = zri_2022_cb - zri_2006_cb) %>%
  filter(diff < - 0.5)

## increases by region ##
inc.reg <- inc %>%
  mutate(region =case_when(
    substr(GEOID,1,2) %in% c("09","23","25","44","50","34","36","42","33") ~ "NE",
    substr(GEOID,1,2) %in% c("17","18","26","39","55","19","20","27","29","31","38","46") ~ "MW",
    substr(GEOID,1,2) %in% c("10","11","12","13","24","37","45","51","54","01","21","28","47","05","22","40","48") ~ "SO",
    substr(GEOID,1,2) %in% c("08","56","16","30","35","06","15","53","02","49","32","41","04") ~ "WE"))

prop.table(table(inc.reg$region))

## decreases by region ##
dec.reg <- dec %>%
  mutate(region =case_when(
    substr(GEOID,1,2) %in% c("09","23","25","44","50","34","36","42","33") ~ "NE",
    substr(GEOID,1,2) %in% c("17","18","26","39","55","19","20","27","29","31","38","46") ~ "MW",
    substr(GEOID,1,2) %in% c("10","11","12","13","24","37","45","51","54","01","21","28","47","05","22","40","48") ~ "SO",
    substr(GEOID,1,2) %in% c("08","56","16","30","35","06","15","53","02","49","32","41","04") ~ "WE"))

prop.table(table(dec.reg$region))

###############################################
## check 2: county level NZLUD summary stats ##
###############################################

#load("USER DEFINED PATH/100_af_muni.Rda")

#pl.to.county.in <- read.csv("USER DEFINED PATH/pl_to_county.csv")
#cs.to.county.in <- read.csv("USER DEFINED PATH/cs_to_county.csv")

## corrections ##
cs.to.county.in$cousubfp[cs.to.county.in$cousubfp == "78972"] <- "78865"
cs.to.county.in$cousubfp[cs.to.county.in$cousubfp == "83090"] <- "83111"

## places first ## 

pl.to.county <- pl.to.county.in %>%
  filter(state != "State code") %>%
  mutate(GEOID = paste0(state,placefp),
         afact_num = as.numeric(afact))

range(nchar(pl.to.county$state))
range(nchar(pl.to.county$placefp))
range(nchar(pl.to.county$GEOID))

length(unique(af.muni.metro$GEOID)) == nrow(af.muni.metro)
length(unique(pl.to.county$GEOID)) == nrow(pl.to.county)

pls.m <- stata.merge(pl.to.county,
                     af.muni.metro,
                     "GEOID")

table(pls.m$merge.variable)

miss <- pls.m %>%
  filter(merge.variable == 2)

pls <- pls.m %>%
  filter(merge.variable == 3 & 
           afact_num > 0.1) %>%
  rename(FIPS = county)

range(nchar(pls$FIPS))
length(unique(pls$FIPS))

pl.dups <- pls %>%
  group_by(GEOID_full) %>%
  summarize(n = n()) %>%
  filter(n>1)

## now county subs ##

cs.to.county <- cs.to.county.in %>%
  filter(county != "County code") %>%
  mutate(st_fips = substr(county,1,2),
         GEOID = paste0(st_fips,cousubfp),
         afact_num = as.numeric(afact))

range(nchar(cs.to.county$st_fips))
range(nchar(cs.to.county$cousubfp))
range(nchar(cs.to.county$GEOID))

length(unique(cs.to.county$GEOID)) == nrow(cs.to.county)

cs.m <- stata.merge(cs.to.county,
                    af.muni.metro,
                    "GEOID")

table(cs.m$merge.variable)


cs <- cs.m %>%
  filter(merge.variable == 3 & 
           GEOID %notin% pls$GEOID) %>%
  rename(FIPS = county)

length(unique(cs$FIPS))

## combine ##

county.full <- bind_rows(pls,
                         cs)

length(unique(county.full$GEOID)) 
length(unique(af.muni.metro$GEOID)) 

gap <- af.muni.metro %>%
  anti_join(county.full, "GEOID")

## first, within county dispersion ## 

county.disp <- county.full %>%
  group_by(FIPS) %>%
  summarize(name = first(cntyname),
            responses = n(),
            sdzri_2006 = sd(zri_st_2006,na.rm=T),
            sdzri_2022 = sd(zri_st_2022,na.rm=T)) %>%
  filter(!is.na(sdzri_2006) | !is.na(sdzri_2006))

sum(county.disp$responses)
summary(county.disp$sdzri_2006)
summary(county.disp$sdzri_2022)

## next, county-level summary stats ##

county <- county.full %>%
  group_by(FIPS) %>%
  summarize(name = first(cntyname),
            responses = n(),
            sindex1_2006 = median(sindex1_2006,na.rm=T),
            sindex2_2006 = median(sindex2_2006,na.rm=T),
            sindex3_2006 = median(sindex3_2006,na.rm=T),
            sindex4_2006 = median(sindex4_2006,na.rm=T),
            sindex5_2006 = median(sindex5_2006,na.rm=T),
            sindex6_2006 = median(sindex6_2006,na.rm=T),
            sindex7_2006 = median(sindex7_2006,na.rm=T),
            zri_2006 = median(zri_2006,na.rm=T),
            sindex1_2022 = median(sindex1_2022,na.rm=T),
            sindex2_2022 = median(sindex2_2022,na.rm=T),
            sindex3_2022 = median(sindex3_2022,na.rm=T),
            sindex4_2022 = median(sindex4_2022,na.rm=T),
            sindex5_2022 = median(sindex5_2022,na.rm=T),
            sindex6_2022 = median(sindex6_2022,na.rm=T),
            sindex7_2022 = median(sindex7_2022,na.rm=T),
            zri_2022 = median(zri_2022,na.rm=T))

sum(county$responses)
summary(county$responses)

county$zri_st_2006 <- (county$zri_2006 - mean(county$zri_2006, na.rm=T))/sd(county$zri_2006,na.rm=T)
county$zri_st_2022 <- (county$zri_2022 - mean(county$zri_2022, na.rm=T))/sd(county$zri_2022,na.rm=T)

## results ##

summary(county$sindex1_2006)
sd(county$sindex1_2006,na.rm=T)
summary(county$sindex2_2006)
sd(county$sindex2_2006,na.rm=T)
summary(county$sindex3_2006)
sd(county$sindex3_2006,na.rm=T)
summary(county$sindex4_2006)
sd(county$sindex4_2006,na.rm=T)    
summary(county$sindex5_2006)
sd(county$sindex5_2006,na.rm=T) 
summary(county$sindex6_2006)
sd(county$sindex6_2006,na.rm=T) 
summary(county$sindex7_2006)
sd(county$sindex7_2006,na.rm=T) 
summary(county$zri_st_2006,na.rm=T)
sd(county$zri_st_2006,na.rm=T) 

summary(county$sindex1_2022)
sd(county$sindex1_2022,na.rm=T)
summary(county$sindex2_2022)
sd(county$sindex2_2022,na.rm=T)
summary(county$sindex3_2022)
sd(county$sindex3_2022,na.rm=T)
summary(county$sindex4_2022)
sd(county$sindex4_2022,na.rm=T)    
summary(county$sindex5_2022)
sd(county$sindex5_2022,na.rm=T) 
summary(county$sindex6_2022)
sd(county$sindex6_2022,na.rm=T) 
summary(county$sindex7_2022)
sd(county$sindex7_2022,na.rm=T) 
summary(county$zri_st_2022,na.rm=T)
sd(county$zri_st_2022,na.rm=T)

## END OF PROGRAM ##
#sink() 

